library(ggplot2)
library(caTools)
library(plotly)
library(MASS)
library(matlib)
library(dplyr)
library(pROC)
# Set random seed for reproducibility
set.seed(123)
p <- 1 # proportion of correlated predictors
npred <- 3 # number of predictors
z <- 0.2 # z
n <- 200000 # Number of samples in the dataset
a <- 2.115 # the effect size
scenario <- function(){
# set up correlations
corr0 <- matrix(0, npred, npred) # matrix: set up for cov matrix, 0 on diagonals
corr0[1:npred*p, 1:npred*p] = z # class 0
diag(corr0) = 0
# covariance structures
sigma0 <- diag(npred) + corr0 # matrix: cov matrix of class 0
return(sigma0)
}
Simulate data for diseased class and non-diseased class using a covariance matrix and mean vector using the MASS package. The covariance matrix is chosen such that the two predictors are correlated. The mean vector is chosen such that the diseased class has a higher mean than the non-diseased class for both predictors. The data is then combined into a single dataset.
# mean structures
mu0 <- c(rep(1,npred)) # vector: class 0 means
# covariance structures
sigma0 <- scenario() # class 0
# Generate data for non-diseased class
data <- mvrnorm(n, mu = mu0, Sigma = sigma0) %>%
as.data.frame()
# Rename columns to "predictor1", "predictor2" ,...
colnames(data) <- c(paste0("predictor", 1:(ncol(data))))
# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
L <- 0.4*a*data$predictor1 + 0.2*a*data$predictor2 + 0.1*a*data$predictor3 + 0.1*a*data$predictor1*data$predictor2
L <- - mean(L) + 0.4*a*data$predictor1 + 0.2*a*data$predictor2 + 0.1*a*data$predictor3 + 0.1*a*data$predictor1*data$predictor2 # 50/50 split
y <- ifelse(runif(n) < plogis(L), 1, 0)
data <- cbind(data, y)
#proportion y's in data
prop.table(table(data$y))
##
## 0 1
## 0.50812 0.49188
# Plot for Predictor 1
ggplot(data, aes(x = predictor1, fill = as.factor(y))) +
geom_histogram(alpha = 0.5, position = 'identity', bins = 30) +
ggtitle("Distribution of Diseased and Non-Diseased Classes for Predictor 1")
# Plot for Predictor 2
ggplot(data, aes(x = predictor2, fill = as.factor(y))) +
geom_histogram(alpha = 0.5, position = 'identity', bins = 30) +
ggtitle("Distribution of Diseased and Non-Diseased Classes for Predictor 2")
### 3D Histogram of Predictor 1 and Predictor 2
# Calculate densities for class 1 and class 0
density1 <- with(data[data$y == 1, ], kde2d(predictor1, predictor2, n = 30))
density0 <- with(data[data$y == 0, ], kde2d(predictor1, predictor2, n = 30))
# Create the interactive 3D plot
p1 <- plot_ly(z = ~density1$z) %>%
add_surface(
x = ~density1$x,
y = ~density1$y,
colorscale = list(c(0, "red"), list(1, "pink")),
opacity = 0.8
)
p2 <- plot_ly(z = ~density0$z) %>%
add_surface(
x = ~density0$x,
y = ~density0$y,
colorscale = list(c(0, "blue"), list(1, "lightblue")),
opacity = 0.8
)
plot <- subplot(p1, p2, nrows = 1, shareX = TRUE, shareY = TRUE)
plot
n_iter <- 1600 # Number of iterations for logistic regression
auc_values <- numeric(n_iter) # Vector to store AUC values
sample_ratio <- 0.15 # Proportion of data to use in each logistic regression
# Run logistic regression and calculate AUC for n_iter iterations
for(i in 1:n_iter) {
# Sample a subset of the data
sample_index <- sample(1:nrow(data), sample_ratio * nrow(data))
sample_data <- data[sample_index, ]
# Run logistic regression with two predictors
model <- glm(y ~ . + predictor1*predictor2, data = sample_data, family = binomial)
# Predict on the same sample set
predictions <- predict(model, newdata = sample_data, type = "response")
# Calculate AUC
auc <- colAUC(predictions, sample_data$y, plotROC = FALSE)
auc_values[i] <- auc # Directly store the AUC value
# Save the model predictors and coefficients to calculate average coefficients
if(i == 1) {
coefficients <- model$coefficients
} else {
coefficients <- coefficients + model$coefficients
}
}
# Calculate average AUC
average_auc <- mean(auc_values)
print(paste("Average AUC: ", round(average_auc, 4)))
## [1] "Average AUC: 0.8"
# Calculate average coefficients
average_coefficients <- coefficients / n_iter
print(paste("Average coefficients: ", round(average_coefficients, 2)))
## [1] "Average coefficients: -1.73" "Average coefficients: 0.85"
## [3] "Average coefficients: 0.42" "Average coefficients: 0.21"
## [5] "Average coefficients: 0.21"
sample_data <- function(data, size, perc_ones, seed = NULL) {
if (!is.null(seed)) set.seed(seed)
# Separate data into two subsets based on y
data_0 <- filter(data, y == 0)
data_1 <- filter(data, y == 1)
# Calculate the number of samples required from each subset
num_ones <- round(size * perc_ones)
num_zeros <- size - num_ones
# Sample from each subset
sampled_0 <- sample_n(data_0, min(num_zeros, nrow(data_0)))
sampled_1 <- sample_n(data_1, min(num_ones, nrow(data_1)))
# Combine the sampled subsets
sampled_data <- rbind(sampled_0, sampled_1)
# Shuffle the rows
sampled_data <- sampled_data[sample(nrow(sampled_data)), ]
return(sampled_data)
}
# Sample from the population
sample_data_50 <- sample_data(data, 100, 0.5, seed = 28)
sample_data_20 <- sample_data(data, 100, 0.2, seed = 29)
sample_data_5 <- sample_data(data, 100, 0.05, seed = 30)
fit_and_evaluate <- function(original_data, times, sample_size, perc_ones, test_size = 0.2, seed = NULL) {
if (!is.null(seed)) set.seed(seed)
auc_values <- numeric(times) # Vector to store AUC values
roc_curves <- data.frame() # Data frame to store ROC curves points
for (i in 1:times) {
# Sample from the original dataset
data <- sample_data(original_data, size = sample_size, perc_ones = perc_ones, seed = seed)
# Split the data
index <- sample(1:nrow(data), size = round(nrow(data) * test_size))
test_set <- data[index, ]
train_set <- data[-index, ]
# Fit logistic regression model
model <- glm(y ~ ., data = train_set, family = "binomial")
# Predict on test set
predictions <- predict(model, newdata = test_set, type = "response")
# Evaluate model performance using AUC
roc_obj <- suppressMessages(roc(response = test_set$y, predictor = predictions, quiet = TRUE))
auc_values[i] <- auc(roc_obj)
# Store ROC curve points
roc_curves <- rbind(roc_curves, data.frame(
tpr = roc_obj$sensitivities,
fpr = 1 - roc_obj$specificities,
iteration = i
))
}
# Calculate the average AUC
mean_auc <- mean(auc_values)
# Plot all ROC curves
roc_plot <- ggplot(roc_curves, aes(x = fpr, y = tpr, group = iteration, color = as.factor(iteration))) +
geom_line(alpha = 0.3) +
geom_line(data = data.frame(fpr = c(0, 1), tpr = c(0, 1)),
aes(x = fpr, y = tpr), linetype = "dashed", inherit.aes = FALSE) +
theme_minimal() +
labs(color = "Iteration") +
ggtitle("ROC Curves from Multiple Iterations")
return(list(AverageAUC = mean_auc, ROCPlot = roc_plot))
}
# Example usage
results.5.100 <- fit_and_evaluate(data, times = 40, sample_size = 100, test_size = .7, perc_ones = 0.5)
results.5.200 <- fit_and_evaluate(data, times = 40, sample_size = 200, test_size = .7, perc_ones = 0.5)
results.5.400 <- fit_and_evaluate(data, times = 40, sample_size = 400, test_size = .7, perc_ones = 0.5)
# Print average AUC
print(results.5.100$AverageAUC)
## [1] 0.7752198
print(results.5.200$AverageAUC)
## [1] 0.7805253
print(results.5.400$AverageAUC)
## [1] 0.7937877
# Plot ROC curves
print(results.5.100$ROCPlot)
print(results.5.200$ROCPlot)
print(results.5.400$ROCPlot)
# Example usage
results.20.100 <- fit_and_evaluate(data, times = 40, sample_size = 100, test_size = .7, perc_ones = 0.2)
results.20.200 <- fit_and_evaluate(data, times = 40, sample_size = 200, test_size = .7, perc_ones = 0.2)
results.20.400 <- fit_and_evaluate(data, times = 40, sample_size = 400, test_size = .7, perc_ones = 0.2)
# Print average AUC
print(results.20.100$AverageAUC)
## [1] 0.7464821
print(results.20.200$AverageAUC)
## [1] 0.7534212
print(results.20.400$AverageAUC)
## [1] 0.7882064
# Plot ROC curves
print(results.20.100$ROCPlot)
print(results.20.200$ROCPlot)
print(results.20.400$ROCPlot)
# Example usage
results.05.100 <- fit_and_evaluate(data, times = 40, sample_size = 100, test_size = .7, perc_ones = 0.05)
results.05.200 <- fit_and_evaluate(data, times = 40, sample_size = 200, test_size = .7, perc_ones = 0.05)
results.05.400 <- fit_and_evaluate(data, times = 40, sample_size = 400, test_size = .7, perc_ones = 0.05)
# Print average AUC
print(results.05.100$AverageAUC)
## [1] 0.667671
print(results.05.200$AverageAUC)
## [1] 0.702614
print(results.05.400$AverageAUC)
## [1] 0.7497505
# Plot ROC curves
print(results.05.100$ROCPlot)
print(results.05.200$ROCPlot)
print(results.05.400$ROCPlot)